1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.lucene.analysis.cn.smart.hhmm;
19  
20  import java.io.DataInputStream;
21  import java.io.IOException;
22  import java.io.InputStream;
23  import java.io.ObjectInputStream;
24  import java.io.ObjectOutputStream;
25  import java.nio.ByteBuffer;
26  import java.nio.ByteOrder;
27  import java.nio.file.Files;
28  import java.nio.file.Path;
29  import java.nio.file.Paths;
30  
31  import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
32  import org.apache.lucene.analysis.cn.smart.Utility;
33  
34  /**
35   * SmartChineseAnalyzer Word Dictionary
36   * @lucene.experimental
37   */
38  class WordDictionary extends AbstractDictionary {
39  
40    private WordDictionary() {
41    }
42  
43    private static WordDictionary singleInstance;
44  
45    /**
46     * Large prime number for hash function
47     */
48    public static final int PRIME_INDEX_LENGTH = 12071;
49  
50    /**
51     * wordIndexTable guarantees to hash all Chinese characters in Unicode into 
52     * PRIME_INDEX_LENGTH array. There will be conflict, but in reality this 
53     * program only handles the 6768 characters found in GB2312 plus some 
54     * ASCII characters. Therefore in order to guarantee better precision, it is
55     * necessary to retain the original symbol in the charIndexTable.
56     */
57    private short[] wordIndexTable;
58  
59    private char[] charIndexTable;
60  
61    /**
62     * To avoid taking too much space, the data structure needed to store the 
63     * lexicon requires two multidimensional arrays to store word and frequency.
64     * Each word is placed in a char[]. Each char represents a Chinese char or 
65     * other symbol.  Each frequency is put into an int. These two arrays 
66     * correspond to each other one-to-one. Therefore, one can use 
67     * wordItem_charArrayTable[i][j] to look up word from lexicon, and 
68     * wordItem_frequencyTable[i][j] to look up the corresponding frequency. 
69     */
70    private char[][][] wordItem_charArrayTable;
71  
72    private int[][] wordItem_frequencyTable;
73  
74    // static Logger log = Logger.getLogger(WordDictionary.class);
75  
76    /**
77     * Get the singleton dictionary instance.
78     * @return singleton
79     */
80    public synchronized static WordDictionary getInstance() {
81      if (singleInstance == null) {
82        singleInstance = new WordDictionary();
83        try {
84          singleInstance.load();
85        } catch (IOException e) {
86          String wordDictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
87          singleInstance.load(wordDictRoot);
88        } catch (ClassNotFoundException e) {
89          throw new RuntimeException(e);
90        }
91      }
92      return singleInstance;
93    }
94  
95    /**
96     * Attempt to load dictionary from provided directory, first trying coredict.mem, failing back on coredict.dct
97     * 
98     * @param dctFileRoot path to dictionary directory
99     */
100   public void load(String dctFileRoot) {
101     String dctFilePath = dctFileRoot + "/coredict.dct";
102     Path serialObj = Paths.get(dctFileRoot + "/coredict.mem");
103 
104     if (Files.exists(serialObj) && loadFromObj(serialObj)) {
105 
106     } else {
107       try {
108         wordIndexTable = new short[PRIME_INDEX_LENGTH];
109         charIndexTable = new char[PRIME_INDEX_LENGTH];
110         for (int i = 0; i < PRIME_INDEX_LENGTH; i++) {
111           charIndexTable[i] = 0;
112           wordIndexTable[i] = -1;
113         }
114         wordItem_charArrayTable = new char[GB2312_CHAR_NUM][][];
115         wordItem_frequencyTable = new int[GB2312_CHAR_NUM][];
116         // int total =
117         loadMainDataFromFile(dctFilePath);
118         expandDelimiterData();
119         mergeSameWords();
120         sortEachItems();
121         // log.info("load dictionary: " + dctFilePath + " total:" + total);
122       } catch (IOException e) {
123         throw new RuntimeException(e.getMessage());
124       }
125 
126       saveToObj(serialObj);
127     }
128 
129   }
130 
131   /**
132    * Load coredict.mem internally from the jar file.
133    * 
134    * @throws IOException If there is a low-level I/O error.
135    */
136   public void load() throws IOException, ClassNotFoundException {
137     InputStream input = this.getClass().getResourceAsStream("coredict.mem");
138     loadFromObjectInputStream(input);
139   }
140 
141   private boolean loadFromObj(Path serialObj) {
142     try {
143       loadFromObjectInputStream(Files.newInputStream(serialObj));
144       return true;
145     } catch (Exception e) {
146       throw new RuntimeException(e);
147     }
148   }
149 
150   private void loadFromObjectInputStream(InputStream serialObjectInputStream)
151       throws IOException, ClassNotFoundException {
152     ObjectInputStream input = new ObjectInputStream(serialObjectInputStream);
153     wordIndexTable = (short[]) input.readObject();
154     charIndexTable = (char[]) input.readObject();
155     wordItem_charArrayTable = (char[][][]) input.readObject();
156     wordItem_frequencyTable = (int[][]) input.readObject();
157     // log.info("load core dict from serialization.");
158     input.close();
159   }
160 
161   private void saveToObj(Path serialObj) {
162     try {
163       ObjectOutputStream output = new ObjectOutputStream(Files.newOutputStream(
164           serialObj));
165       output.writeObject(wordIndexTable);
166       output.writeObject(charIndexTable);
167       output.writeObject(wordItem_charArrayTable);
168       output.writeObject(wordItem_frequencyTable);
169       output.close();
170       // log.info("serialize core dict.");
171     } catch (Exception e) {
172       // log.warn(e.getMessage());
173     }
174   }
175 
176   /**
177    * Load the datafile into this WordDictionary
178    * 
179    * @param dctFilePath path to word dictionary (coredict.dct)
180    * @return number of words read
181    * @throws IOException If there is a low-level I/O error.
182    */
183   private int loadMainDataFromFile(String dctFilePath) throws IOException {
184     int i, cnt, length, total = 0;
185     // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
186     // The 3756th is used (as a header) to store information.
187     int[] buffer = new int[3];
188     byte[] intBuffer = new byte[4];
189     String tmpword;
190     DataInputStream dctFile = new DataInputStream(Files.newInputStream(Paths.get(dctFilePath)));
191 
192     // GB2312 characters 0 - 6768
193     for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
194       // if (i == 5231)
195       // System.out.println(i);
196 
197       dctFile.read(intBuffer);
198       // the dictionary was developed for C, and byte order must be converted to work with Java
199       cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
200       if (cnt <= 0) {
201         wordItem_charArrayTable[i] = null;
202         wordItem_frequencyTable[i] = null;
203         continue;
204       }
205       wordItem_charArrayTable[i] = new char[cnt][];
206       wordItem_frequencyTable[i] = new int[cnt];
207       total += cnt;
208       int j = 0;
209       while (j < cnt) {
210         // wordItemTable[i][j] = new WordItem();
211         dctFile.read(intBuffer);
212         buffer[0] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
213             .getInt();// frequency
214         dctFile.read(intBuffer);
215         buffer[1] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
216             .getInt();// length
217         dctFile.read(intBuffer);
218         buffer[2] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
219             .getInt();// handle
220 
221         // wordItemTable[i][j].frequency = buffer[0];
222         wordItem_frequencyTable[i][j] = buffer[0];
223 
224         length = buffer[1];
225         if (length > 0) {
226           byte[] lchBuffer = new byte[length];
227           dctFile.read(lchBuffer);
228           tmpword = new String(lchBuffer, "GB2312");
229           // indexTable[i].wordItems[j].word = tmpword;
230           // wordItemTable[i][j].charArray = tmpword.toCharArray();
231           wordItem_charArrayTable[i][j] = tmpword.toCharArray();
232         } else {
233           // wordItemTable[i][j].charArray = null;
234           wordItem_charArrayTable[i][j] = null;
235         }
236         // System.out.println(indexTable[i].wordItems[j]);
237         j++;
238       }
239 
240       String str = getCCByGB2312Id(i);
241       setTableIndex(str.charAt(0), i);
242     }
243     dctFile.close();
244     return total;
245   }
246 
247   /**
248    * The original lexicon puts all information with punctuation into a 
249    * chart (from 1 to 3755). Here it then gets expanded, separately being
250    * placed into the chart that has the corresponding symbol.
251    */
252   private void expandDelimiterData() {
253     int i;
254     int cnt;
255     // Punctuation then treating index 3755 as 1, 
256     // distribute the original punctuation corresponding dictionary into 
257     int delimiterIndex = 3755 + GB2312_FIRST_CHAR;
258     i = 0;
259     while (i < wordItem_charArrayTable[delimiterIndex].length) {
260       char c = wordItem_charArrayTable[delimiterIndex][i][0];
261       int j = getGB2312Id(c);// the id value of the punctuation
262       if (wordItem_charArrayTable[j] == null) {
263 
264         int k = i;
265         // Starting from i, count the number of the following worditem symbol from j
266         while (k < wordItem_charArrayTable[delimiterIndex].length
267             && wordItem_charArrayTable[delimiterIndex][k][0] == c) {
268           k++;
269         }
270         // c is the punctuation character, j is the id value of c
271         // k-1 represents the index of the last punctuation character
272         cnt = k - i;
273         if (cnt != 0) {
274           wordItem_charArrayTable[j] = new char[cnt][];
275           wordItem_frequencyTable[j] = new int[cnt];
276         }
277 
278         // Assign value for each wordItem.
279         for (k = 0; k < cnt; k++, i++) {
280           // wordItemTable[j][k] = new WordItem();
281           wordItem_frequencyTable[j][k] = wordItem_frequencyTable[delimiterIndex][i];
282           wordItem_charArrayTable[j][k] = new char[wordItem_charArrayTable[delimiterIndex][i].length - 1];
283           System.arraycopy(wordItem_charArrayTable[delimiterIndex][i], 1,
284               wordItem_charArrayTable[j][k], 0,
285               wordItem_charArrayTable[j][k].length);
286         }
287         setTableIndex(c, j);
288       }
289     }
290     // Delete the original corresponding symbol array.
291     wordItem_charArrayTable[delimiterIndex] = null;
292     wordItem_frequencyTable[delimiterIndex] = null;
293   }
294 
295   /*
296    * since we aren't doing POS-tagging, merge the frequencies for entries of the same word (with different POS)
297    */
298   private void mergeSameWords() {
299     int i;
300     for (i = 0; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
301       if (wordItem_charArrayTable[i] == null)
302         continue;
303       int len = 1;
304       for (int j = 1; j < wordItem_charArrayTable[i].length; j++) {
305         if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
306             wordItem_charArrayTable[i][j - 1], 0) != 0)
307           len++;
308 
309       }
310       if (len < wordItem_charArrayTable[i].length) {
311         char[][] tempArray = new char[len][];
312         int[] tempFreq = new int[len];
313         int k = 0;
314         tempArray[0] = wordItem_charArrayTable[i][0];
315         tempFreq[0] = wordItem_frequencyTable[i][0];
316         for (int j = 1; j < wordItem_charArrayTable[i].length; j++) {
317           if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
318               tempArray[k], 0) != 0) {
319             k++;
320             // temp[k] = wordItemTable[i][j];
321             tempArray[k] = wordItem_charArrayTable[i][j];
322             tempFreq[k] = wordItem_frequencyTable[i][j];
323           } else {
324             // temp[k].frequency += wordItemTable[i][j].frequency;
325             tempFreq[k] += wordItem_frequencyTable[i][j];
326           }
327         }
328         // wordItemTable[i] = temp;
329         wordItem_charArrayTable[i] = tempArray;
330         wordItem_frequencyTable[i] = tempFreq;
331       }
332     }
333   }
334 
335   private void sortEachItems() {
336     char[] tmpArray;
337     int tmpFreq;
338     for (int i = 0; i < wordItem_charArrayTable.length; i++) {
339       if (wordItem_charArrayTable[i] != null
340           && wordItem_charArrayTable[i].length > 1) {
341         for (int j = 0; j < wordItem_charArrayTable[i].length - 1; j++) {
342           for (int j2 = j + 1; j2 < wordItem_charArrayTable[i].length; j2++) {
343             if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
344                 wordItem_charArrayTable[i][j2], 0) > 0) {
345               tmpArray = wordItem_charArrayTable[i][j];
346               tmpFreq = wordItem_frequencyTable[i][j];
347               wordItem_charArrayTable[i][j] = wordItem_charArrayTable[i][j2];
348               wordItem_frequencyTable[i][j] = wordItem_frequencyTable[i][j2];
349               wordItem_charArrayTable[i][j2] = tmpArray;
350               wordItem_frequencyTable[i][j2] = tmpFreq;
351             }
352           }
353         }
354       }
355     }
356   }
357 
358   /*
359    * Calculate character c's position in hash table, 
360    * then initialize the value of that position in the address table.
361    */
362   private boolean setTableIndex(char c, int j) {
363     int index = getAvaliableTableIndex(c);
364     if (index != -1) {
365       charIndexTable[index] = c;
366       wordIndexTable[index] = (short) j;
367       return true;
368     } else
369       return false;
370   }
371 
372   private short getAvaliableTableIndex(char c) {
373     int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
374     int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
375     if (hash1 < 0)
376       hash1 = PRIME_INDEX_LENGTH + hash1;
377     if (hash2 < 0)
378       hash2 = PRIME_INDEX_LENGTH + hash2;
379     int index = hash1;
380     int i = 1;
381     while (charIndexTable[index] != 0 && charIndexTable[index] != c
382         && i < PRIME_INDEX_LENGTH) {
383       index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
384       i++;
385     }
386     // System.out.println(i - 1);
387 
388     if (i < PRIME_INDEX_LENGTH
389         && (charIndexTable[index] == 0 || charIndexTable[index] == c)) {
390       return (short) index;
391     } else
392       return -1;
393   }
394 
395   private short getWordItemTableIndex(char c) {
396     int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
397     int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
398     if (hash1 < 0)
399       hash1 = PRIME_INDEX_LENGTH + hash1;
400     if (hash2 < 0)
401       hash2 = PRIME_INDEX_LENGTH + hash2;
402     int index = hash1;
403     int i = 1;
404     while (charIndexTable[index] != 0 && charIndexTable[index] != c
405         && i < PRIME_INDEX_LENGTH) {
406       index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
407       i++;
408     }
409 
410     if (i < PRIME_INDEX_LENGTH && charIndexTable[index] == c) {
411       return (short) index;
412     } else
413       return -1;
414   }
415 
416   /**
417    * Look up the text string corresponding with the word char array, 
418    * and return the position of the word list.
419    * 
420    * @param knownHashIndex already figure out position of the first word 
421    *   symbol charArray[0] in hash table. If not calculated yet, can be 
422    *   replaced with function int findInTable(char[] charArray).
423    * @param charArray look up the char array corresponding with the word.
424    * @return word location in word array.  If not found, then return -1.
425    */
426   private int findInTable(short knownHashIndex, char[] charArray) {
427     if (charArray == null || charArray.length == 0)
428       return -1;
429 
430     char[][] items = wordItem_charArrayTable[wordIndexTable[knownHashIndex]];
431     int start = 0, end = items.length - 1;
432     int mid = (start + end) / 2, cmpResult;
433 
434     // Binary search for the index of idArray
435     while (start <= end) {
436       cmpResult = Utility.compareArray(items[mid], 0, charArray, 1);
437 
438       if (cmpResult == 0)
439         return mid;// find it
440       else if (cmpResult < 0)
441         start = mid + 1;
442       else if (cmpResult > 0)
443         end = mid - 1;
444 
445       mid = (start + end) / 2;
446     }
447     return -1;
448   }
449 
450   /**
451    * Find the first word in the dictionary that starts with the supplied prefix
452    * 
453    * @see #getPrefixMatch(char[], int)
454    * @param charArray input prefix
455    * @return index of word, or -1 if not found
456    */
457   public int getPrefixMatch(char[] charArray) {
458     return getPrefixMatch(charArray, 0);
459   }
460 
461   /**
462    * Find the nth word in the dictionary that starts with the supplied prefix
463    * 
464    * @see #getPrefixMatch(char[])
465    * @param charArray input prefix
466    * @param knownStart relative position in the dictionary to start
467    * @return index of word, or -1 if not found
468    */
469   public int getPrefixMatch(char[] charArray, int knownStart) {
470     short index = getWordItemTableIndex(charArray[0]);
471     if (index == -1)
472       return -1;
473     char[][] items = wordItem_charArrayTable[wordIndexTable[index]];
474     int start = knownStart, end = items.length - 1;
475 
476     int mid = (start + end) / 2, cmpResult;
477 
478     // Binary search for the index of idArray
479     while (start <= end) {
480       cmpResult = Utility.compareArrayByPrefix(charArray, 1, items[mid], 0);
481       if (cmpResult == 0) {
482         // Get the first item which match the current word
483         while (mid >= 0
484             && Utility.compareArrayByPrefix(charArray, 1, items[mid], 0) == 0)
485           mid--;
486         mid++;
487         return mid;// Find the first word that uses charArray as prefix.
488       } else if (cmpResult < 0)
489         end = mid - 1;
490       else
491         start = mid + 1;
492       mid = (start + end) / 2;
493     }
494     return -1;
495   }
496 
497   /**
498    * Get the frequency of a word from the dictionary
499    * 
500    * @param charArray input word
501    * @return word frequency, or zero if the word is not found
502    */
503   public int getFrequency(char[] charArray) {
504     short hashIndex = getWordItemTableIndex(charArray[0]);
505     if (hashIndex == -1)
506       return 0;
507     int itemIndex = findInTable(hashIndex, charArray);
508     if (itemIndex != -1)
509       return wordItem_frequencyTable[wordIndexTable[hashIndex]][itemIndex];
510     return 0;
511 
512   }
513 
514   /**
515    * Return true if the dictionary entry at itemIndex for table charArray[0] is charArray
516    * 
517    * @param charArray input word
518    * @param itemIndex item index for table charArray[0]
519    * @return true if the entry exists
520    */
521   public boolean isEqual(char[] charArray, int itemIndex) {
522     short hashIndex = getWordItemTableIndex(charArray[0]);
523     return Utility.compareArray(charArray, 1,
524         wordItem_charArrayTable[wordIndexTable[hashIndex]][itemIndex], 0) == 0;
525   }
526 
527 }